Digitised books

import pandas as pd
import altair as alt
from IPython.display import HTML
# Remove Altair's cap on dataset size so that charts can be built from the full dataframe.
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
# Location of the harvested book metadata CSV.
TROVE_BOOKS_CSV = "https://raw.githubusercontent.com/GLAM-Workbench/trove-books-data/main/trove-books.csv"

# keep_default_na=False stops pandas converting blank cells to NaN, so the
# empty-string comparisons and string operations used below work on every row.
df = pd.read_csv(TROVE_BOOKS_CSV, keep_default_na=False)
# (rows, columns) of the loaded dataset.
df.shape
(21218, 24)
# Shape of the subset of records that have a non-empty `text_file` value.
df.loc[df["text_file"] != ""].shape
(17692, 24)
# A single `rights` cell can hold several values joined by " | "; split them
# apart (literal match, not a regex) and count each value separately.
df["rights"].str.split(" | ", regex=False).explode().value_counts()
rights
http://rightsstatements.org/vocab/NKC/1.0/             17300
Out of Copyright                                       15848
No known copyright restrictions                         4492
                                                         433
In Copyright                                             397
Perpetual                                                 46
Mixed copyright                                            2
Out of copyright                                           2
http://creativecommons.org/licenses/by-nc-nd/3.0/au        1
Name: count, dtype: int64
# A `language` cell may list several languages joined by " | ";
# split on that literal separator and give every language its own row.
language_lists = df["language"].str.split(" | ", regex=False)
languages = language_lists.explode()
# The ten most frequent language values.
languages.value_counts().head(10)
language
English                 17903
Chinese                  1685
                          518
French                    256
Undetermined              162
German                    132
Japanese                   95
Italian                    89
Austronesian (Other)       80
Dutch                      77
Name: count, dtype: int64
# Number of distinct values in the exploded language series.
languages.nunique()
251
# Map each language to the number of times it occurs ...
language_tally = languages.value_counts()
language_counts = language_tally.to_dict()
# ... then remove the English entry from the mapping.
del language_counts["English"]
from wordcloud import WordCloud

# Build an 800x400 word cloud from the language -> count mapping.
wc = WordCloud(width=800, height=400)
wc.generate_from_frequencies(language_counts)
# Convert the cloud to an image so it is shown as the cell output.
wc.to_image()
../../_images/55587a0f8ffdedd8782a2c7e3a52301899c2578207c3b80a89f42cb957fbef93.png
# Display the `pages` column.
df["pages"]
0        130
1         24
2         24
3         65
4        246
        ... 
21213     86
21214     52
21215     60
21216      6
21217     44
Name: pages, Length: 21218, dtype: int64
# The `date` column is free text, so take the first standalone four-digit
# number beginning with 16, 17, 18, 19 or 20 (i.e. 1600-2099) as the year.
# Rows with no such number get a missing value.
year_pattern = r"\b((?:16|17|18|19|20)\d{2})\b"
df["year"] = df["date"].str.extract(year_pattern)

# Number of records per extracted year, as a two-column frame for Altair.
year_counts = df["year"].value_counts().to_frame().reset_index()

# Thin bars on a temporal x-axis, with the record count as bar height.
year_chart = alt.Chart(year_counts).mark_bar(size=1).encode(
    x="year:T",
    y="count:Q",
)
year_chart.properties(width=800, height=200)
# Histogram of page counts: `pages` is binned along the x-axis and the bar
# height is the number of records that fall into each bin.
alt.Chart(df).mark_bar().encode(
    x=alt.X("pages:Q").bin(),
    y="count()"
).properties(width=800, height=200)
# Full record for the row with the largest `pages` value
# (`idxmax` returns the index label of the first maximum).
df.loc[df["pages"].idxmax()]
title                                   Jiu xin yue quan shu. | 舊新約全書.
sub_unit                                                              
contributor          British and Foreign Bible Society | British an...
publisher            Sheng shu gong hui | [Guangzhou] : Sheng shu g...
date                                                              1907
type                                                                  
format                                                            Book
extent                         1308, 400 pages ; 23 cm. | 703718 words
language                                                       Chinese
subject                                                               
spatial                                                               
is_part_of                                                            
identifier           AuCNLKIN: 000015575462 | LMS 545 | OCoLC: 2224...
rights               No known copyright restrictions | http://right...
pages                                                             1723
fulltext_url                       https://nla.gov.au/nla.obj-79373227
fulltext_url_text         National Library of Australia digitised item
catalogue_url                      https://nla.gov.au/nla.cat-vn483026
work_url                        https://trove.nla.gov.au/work/12620536
work_type                                                         Book
parent                                                                
parent_url                                                            
children                                                              
text_file                                         nla.obj-79373227.txt
year                                                              1907
Name: 2310, dtype: object
import re


# Matches a "--" that sits directly between two word characters. Lookarounds
# (rather than capturing the neighbours) leave those characters unconsumed, so
# every separator in a run such as "a--b--c" is matched.
_TIGHT_DOUBLE_DASH = re.compile(r"(?<=\w)--(?=\w)")


def split_and_clean(value):
    """Split a pipe-delimited string into a list of unique, cleaned values.

    Each segment has any "--" wedged between two word characters spaced out
    to " -- ", is stripped of surrounding whitespace, and then of leading and
    trailing full stops. Segments that are empty after cleaning are dropped.

    Args:
        value: String of values separated by "|".

    Returns:
        The distinct cleaned values, in order of first appearance. An empty
        or whitespace-only input yields an empty list.
    """
    cleaned = (
        _TIGHT_DOUBLE_DASH.sub(" -- ", part).strip().strip(".")
        for part in value.split("|")
    )
    # Filter after cleaning so whitespace-only segments don't survive as "";
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(part for part in cleaned if part))


# Turn each record's pipe-delimited `subject` string into a list of headings.
df["subjects"] = df["subject"].apply(split_and_clean)

# One row per heading, so that individual headings can be counted.
subjects = df["subjects"].explode().to_frame()

# The twenty most frequent headings, shown with thousands separators
# and without the index column.
top_subjects = subjects["subjects"].value_counts().to_frame().reset_index().head(20)
top_subjects.style.format(thousands=",").hide()
subjects count
Australian 3,457
Printed ephemera -- Australia 2,090
Menus -- Specimens 963
Chinese language -- Dictionaries 610
Menus - Shipping (general, Orient, P&O) 480
Menus -- Australia -- Specimens 470
Invitation cards -- Australia 423
Programs -- Australia 419
Australia 369
Menus - General 335
Menus - Specific events 242
Commercial catalogs -- Australia 225
Business -- Australia -- Miscellanea 224
Australia -- Commerce -- Miscellanea 224
Advertising -- Australia -- Miscellanea 219
Retail trade -- Australia -- Miscellanea 203
Trade literature -- Australia 201
Industries -- Australia -- Miscellanea 198
Advertising -- Brand name products -- Australia 181
Broadsides -- Australia 169
# Pick five random records whose subject string mentions "Menus".
menu_books = df.loc[df["subject"].str.contains("Menus")]
random_sample = menu_books.sample(5)[["title", "contributor", "date", "fulltext_url"]]

# Derive a thumbnail address by appending "-t" to each full-text URL
# (presumably the thumbnail form of the item URL -- confirm against the source site).
random_sample["thumbnail"] = [f"{url}-t" for url in random_sample["fulltext_url"]]

# Put the thumbnail first so that it leads each row of the rendered table.
column_order = ["thumbnail", "title", "contributor", "date", "fulltext_url"]
random_sample = random_sample[column_order]


def to_img_tag(path):
    """Return an HTML ``<img>`` tag, 50 wide, whose ``src`` is *path*."""
    pieces = ('<img src="', path, '" width="50" >')
    return "".join(pieces)


# Render the sample as an HTML table. escape=False lets the <img> markup
# produced by to_img_tag through unescaped, render_links=True turns URLs
# into links, and index=False leaves out the index column.
sample_table_html = random_sample.to_html(
    escape=False,
    formatters={"thumbnail": to_img_tag},
    index=False,
    render_links=True,
)
display(HTML(sample_table_html))
thumbnail title contributor date fulltext_url
[Menus - General : ephemera material collected by the National Library of Australia]. 1900 - https://nla.gov.au/nla.obj-53483422
Sitmar Cruises - T.S.S. Fairstar - 24 February 1977 | [Menus - Shipping (general, Orient, P&O) : ephemera material collected by the National Library of Australia]. 1900 - https://nla.gov.au/nla.obj-53632677
[Menus - General : ephemera material collected by the National Library of Australia]. 1900 - https://nla.gov.au/nla.obj-53482817
[Menus - Specific events : ephemera material collected by the National Library of Australia]. 1900 - https://nla.gov.au/nla.obj-54292748
[Menus - Specific events : ephemera material collected by the National Library of Australia]. 1900 - https://nla.gov.au/nla.obj-54292942